{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {
    "slideshow": {
     "slide_type": "slide"
    }
   },
   "source": [
    "# Simple linear models\n",
    "\n",
    "- \"model_formulas\" is based on examples in Kaplan \"Statistical Modeling\".\n",
    "- \"polynomial_regression\" shows how to work with simple design matrices, like MATLAB's \"regress\" command.\n",
    "\n",
    "Author: Thomas Haslwanter, Date:   Feb-2017"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "collapsed": false,
    "slideshow": {
     "slide_type": "fragment"
    }
   },
   "outputs": [],
   "source": [
    "%matplotlib inline\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "from urllib.request import urlopen\n",
    "from statsmodels.formula.api import ols\n",
    "import statsmodels.regression.linear_model as sm\n",
    "from statsmodels.stats.anova import anova_lm"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "slideshow": {
     "slide_type": "slide"
    }
   },
   "source": [
    "## Define models through formulas"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "collapsed": false,
    "slideshow": {
     "slide_type": "fragment"
    }
   },
   "outputs": [],
   "source": [
    "# Get the data\n",
    "inFile = 'swim100m.csv'\n",
    "url_base = 'https://raw.githubusercontent.com/thomas-haslwanter/statsintro_python/master/ipynb/Data/data_kaplan/'\n",
    "url = url_base + inFile\n",
    "data = pd.read_csv(urlopen(url))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "slideshow": {
     "slide_type": "subslide"
    }
   },
   "source": [
    "### OLS Model"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "collapsed": false,
    "slideshow": {
     "slide_type": "fragment"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "                            OLS Regression Results                            \n",
      "==============================================================================\n",
      "Dep. Variable:                   time   R-squared:                       0.287\n",
      "Model:                            OLS   Adj. R-squared:                  0.275\n",
      "Method:                 Least Squares   F-statistic:                     24.13\n",
      "Date:                Sat, 04 Feb 2017   Prob (F-statistic):           7.28e-06\n",
      "Time:                        14:21:20   Log-Likelihood:                -219.23\n",
      "No. Observations:                  62   AIC:                             442.5\n",
      "Df Residuals:                      60   BIC:                             446.7\n",
      "Df Model:                           1                                         \n",
      "Covariance Type:            nonrobust                                         \n",
      "==============================================================================\n",
      "                 coef    std err          t      P>|t|      [0.025      0.975]\n",
      "------------------------------------------------------------------------------\n",
      "Intercept     65.1923      1.517     42.986      0.000      62.159      68.226\n",
      "sex[T.M]     -10.5361      2.145     -4.912      0.000     -14.826      -6.246\n",
      "==============================================================================\n",
      "Omnibus:                       16.370   Durbin-Watson:                   0.353\n",
      "Prob(Omnibus):                  0.000   Jarque-Bera (JB):               19.838\n",
      "Skew:                           1.113   Prob(JB):                     4.92e-05\n",
      "Kurtosis:                       4.649   Cond. No.                         2.62\n",
      "==============================================================================\n",
      "\n",
      "Warnings:\n",
      "[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.\n"
     ]
    }
   ],
   "source": [
    "# Different models\n",
    "model1 = ols(\"time ~ sex\", data).fit()  # one factor\n",
    "print(model1.summary())"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "slideshow": {
     "slide_type": "subslide"
    }
   },
   "source": [
    "### ANOVA"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "collapsed": false,
    "slideshow": {
     "slide_type": "fragment"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "            df       sum_sq      mean_sq          F    PR(>F)\n",
      "sex        1.0  1720.655232  1720.655232  24.132575  0.000007\n",
      "Residual  60.0  4278.006477    71.300108        NaN       NaN\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "C:\\Programs\\WinPython-64bit-3.6.0.1Qt5\\python-3.6.0.amd64\\lib\\site-packages\\scipy\\stats\\_distn_infrastructure.py:875: RuntimeWarning: invalid value encountered in greater\n",
      "  return (self.a < x) & (x < self.b)\n",
      "C:\\Programs\\WinPython-64bit-3.6.0.1Qt5\\python-3.6.0.amd64\\lib\\site-packages\\scipy\\stats\\_distn_infrastructure.py:875: RuntimeWarning: invalid value encountered in less\n",
      "  return (self.a < x) & (x < self.b)\n",
      "C:\\Programs\\WinPython-64bit-3.6.0.1Qt5\\python-3.6.0.amd64\\lib\\site-packages\\scipy\\stats\\_distn_infrastructure.py:1814: RuntimeWarning: invalid value encountered in less_equal\n",
      "  cond2 = cond0 & (x <= self.a)\n"
     ]
    }
   ],
   "source": [
    "print(anova_lm(model1))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "collapsed": false,
    "slideshow": {
     "slide_type": "subslide"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "                            OLS Regression Results                            \n",
      "==============================================================================\n",
      "Dep. Variable:                   time   R-squared:                       0.844\n",
      "Model:                            OLS   Adj. R-squared:                  0.839\n",
      "Method:                 Least Squares   F-statistic:                     159.6\n",
      "Date:                Sat, 04 Feb 2017   Prob (F-statistic):           1.58e-24\n",
      "Time:                        14:21:20   Log-Likelihood:                -172.12\n",
      "No. Observations:                  62   AIC:                             350.2\n",
      "Df Residuals:                      59   BIC:                             356.6\n",
      "Df Model:                           2                                         \n",
      "Covariance Type:            nonrobust                                         \n",
      "==============================================================================\n",
      "                 coef    std err          t      P>|t|      [0.025      0.975]\n",
      "------------------------------------------------------------------------------\n",
      "Intercept    555.7168     33.800     16.441      0.000     488.083     623.350\n",
      "sex[T.M]      -9.7980      1.013     -9.673      0.000     -11.825      -7.771\n",
      "year          -0.2515      0.017    -14.516      0.000      -0.286      -0.217\n",
      "==============================================================================\n",
      "Omnibus:                       52.546   Durbin-Watson:                   0.375\n",
      "Prob(Omnibus):                  0.000   Jarque-Bera (JB):              241.626\n",
      "Skew:                           2.430   Prob(JB):                     3.40e-53\n",
      "Kurtosis:                      11.362   Cond. No.                     1.30e+05\n",
      "==============================================================================\n",
      "\n",
      "Warnings:\n",
      "[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.\n",
      "[2] The condition number is large, 1.3e+05. This might indicate that there are\n",
      "strong multicollinearity or other numerical problems.\n"
     ]
    }
   ],
   "source": [
    "model2 = ols(\"time ~ sex + year\", data).fit()   # two factors\n",
    "print(model2.summary())\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "collapsed": false,
    "slideshow": {
     "slide_type": "subslide"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "            df       sum_sq      mean_sq           F        PR(>F)\n",
      "sex        1.0  1720.655232  1720.655232  108.479881  5.475511e-15\n",
      "year       1.0  3342.177104  3342.177104  210.709831  3.935386e-21\n",
      "Residual  59.0   935.829374    15.861515         NaN           NaN\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "C:\\Programs\\WinPython-64bit-3.6.0.1Qt5\\python-3.6.0.amd64\\lib\\site-packages\\scipy\\stats\\_distn_infrastructure.py:875: RuntimeWarning: invalid value encountered in greater\n",
      "  return (self.a < x) & (x < self.b)\n",
      "C:\\Programs\\WinPython-64bit-3.6.0.1Qt5\\python-3.6.0.amd64\\lib\\site-packages\\scipy\\stats\\_distn_infrastructure.py:875: RuntimeWarning: invalid value encountered in less\n",
      "  return (self.a < x) & (x < self.b)\n",
      "C:\\Programs\\WinPython-64bit-3.6.0.1Qt5\\python-3.6.0.amd64\\lib\\site-packages\\scipy\\stats\\_distn_infrastructure.py:1814: RuntimeWarning: invalid value encountered in less_equal\n",
      "  cond2 = cond0 & (x <= self.a)\n"
     ]
    }
   ],
   "source": [
    "print(anova_lm(model2))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {
    "collapsed": false,
    "slideshow": {
     "slide_type": "subslide"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "                            OLS Regression Results                            \n",
      "==============================================================================\n",
      "Dep. Variable:                   time   R-squared:                       0.893\n",
      "Model:                            OLS   Adj. R-squared:                  0.888\n",
      "Method:                 Least Squares   F-statistic:                     162.1\n",
      "Date:                Sat, 04 Feb 2017   Prob (F-statistic):           3.67e-28\n",
      "Time:                        14:21:21   Log-Likelihood:                -160.30\n",
      "No. Observations:                  62   AIC:                             328.6\n",
      "Df Residuals:                      58   BIC:                             337.1\n",
      "Df Model:                           3                                         \n",
      "Covariance Type:            nonrobust                                         \n",
      "=================================================================================\n",
      "                    coef    std err          t      P>|t|      [0.025      0.975]\n",
      "---------------------------------------------------------------------------------\n",
      "Intercept       697.3012     39.221     17.779      0.000     618.791     775.811\n",
      "sex[T.M]       -302.4638     56.412     -5.362      0.000    -415.384    -189.544\n",
      "year             -0.3240      0.020    -16.118      0.000      -0.364      -0.284\n",
      "sex[T.M]:year     0.1499      0.029      5.189      0.000       0.092       0.208\n",
      "==============================================================================\n",
      "Omnibus:                       49.919   Durbin-Watson:                   0.575\n",
      "Prob(Omnibus):                  0.000   Jarque-Bera (JB):              243.008\n",
      "Skew:                           2.224   Prob(JB):                     1.70e-53\n",
      "Kurtosis:                      11.619   Cond. No.                     3.40e+05\n",
      "==============================================================================\n",
      "\n",
      "Warnings:\n",
      "[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.\n",
      "[2] The condition number is large, 3.4e+05. This might indicate that there are\n",
      "strong multicollinearity or other numerical problems.\n"
     ]
    }
   ],
   "source": [
    "model3 = ols(\"time ~ sex * year\", data).fit()   # two factors with interaction\n",
    "print(model3.summary())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {
    "collapsed": false,
    "slideshow": {
     "slide_type": "subslide"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "            df       sum_sq      mean_sq           F        PR(>F)\n",
      "sex        1.0  1720.655232  1720.655232  156.140793  4.299569e-18\n",
      "year       1.0  3342.177104  3342.177104  303.285733  1.039245e-24\n",
      "sex:year   1.0   296.675432   296.675432   26.921801  2.826421e-06\n",
      "Residual  58.0   639.153942    11.019896         NaN           NaN\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "C:\\Programs\\WinPython-64bit-3.6.0.1Qt5\\python-3.6.0.amd64\\lib\\site-packages\\scipy\\stats\\_distn_infrastructure.py:875: RuntimeWarning: invalid value encountered in greater\n",
      "  return (self.a < x) & (x < self.b)\n",
      "C:\\Programs\\WinPython-64bit-3.6.0.1Qt5\\python-3.6.0.amd64\\lib\\site-packages\\scipy\\stats\\_distn_infrastructure.py:875: RuntimeWarning: invalid value encountered in less\n",
      "  return (self.a < x) & (x < self.b)\n",
      "C:\\Programs\\WinPython-64bit-3.6.0.1Qt5\\python-3.6.0.amd64\\lib\\site-packages\\scipy\\stats\\_distn_infrastructure.py:1814: RuntimeWarning: invalid value encountered in less_equal\n",
      "  cond2 = cond0 & (x <= self.a)\n"
     ]
    }
   ],
   "source": [
    "print(anova_lm(model3))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "slideshow": {
     "slide_type": "slide"
    }
   },
   "source": [
    "# Polynomial Regression\n",
    "\n",
    "Here we define the model directly through the design matrix. Similar to MATLAB's \"regress\" command."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {
    "collapsed": false,
    "slideshow": {
     "slide_type": "fragment"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Summary:\n",
      "                            OLS Regression Results                            \n",
      "==============================================================================\n",
      "Dep. Variable:                      y   R-squared:                       0.993\n",
      "Model:                            OLS   Adj. R-squared:                  0.993\n",
      "Method:                 Least Squares   F-statistic:                     7046.\n",
      "Date:                Sat, 04 Feb 2017   Prob (F-statistic):          9.76e-106\n",
      "Time:                        14:21:21   Log-Likelihood:                -313.30\n",
      "No. Observations:                 100   AIC:                             632.6\n",
      "Df Residuals:                      97   BIC:                             640.4\n",
      "Df Model:                           2                                         \n",
      "Covariance Type:            nonrobust                                         \n",
      "==============================================================================\n",
      "                 coef    std err          t      P>|t|      [0.025      0.975]\n",
      "------------------------------------------------------------------------------\n",
      "const          3.7463      1.658      2.260      0.026       0.456       7.036\n",
      "x1             3.1027      0.774      4.009      0.000       1.567       4.639\n",
      "x2             1.9708      0.076     26.055      0.000       1.821       2.121\n",
      "==============================================================================\n",
      "Omnibus:                        1.436   Durbin-Watson:                   1.845\n",
      "Prob(Omnibus):                  0.488   Jarque-Bera (JB):                1.189\n",
      "Skew:                          -0.043   Prob(JB):                        0.552\n",
      "Kurtosis:                       2.473   Cond. No.                         142.\n",
      "==============================================================================\n",
      "\n",
      "Warnings:\n",
      "[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.\n"
     ]
    }
   ],
   "source": [
    "# Generate the data\n",
    "t = np.arange(0,10,0.1)\n",
    "y = 4 + 3*t + 2*t**2 + 5*np.random.randn(len(t))\n",
    "\n",
    "# Make the fit. Note that this is another \"OLS\" than the one in \"model_formulas\"!\n",
    "M = np.column_stack((np.ones(len(t)), t, t**2))\n",
    "res = sm.OLS(y, M).fit()\n",
    "    \n",
    "# Display the results\n",
    "print('Summary:')\n",
    "print(res.summary())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {
    "collapsed": false,
    "slideshow": {
     "slide_type": "subslide"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "The fit parameters are: [ 3.74631449  3.10268496  1.97082745]\n",
      "The confidence intervals are:\n",
      "[[ 0.45628815  7.03634082]\n",
      " [ 1.56675671  4.6386132 ]\n",
      " [ 1.82070297  2.12095193]]\n"
     ]
    }
   ],
   "source": [
    "print('The fit parameters are: {0}'.format(str(res.params)))\n",
    "print('The confidence intervals are:')\n",
    "print(res.conf_int())"
   ]
  }
 ],
 "metadata": {
  "celltoolbar": "Slideshow",
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.0"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 0
}
